% Conference paper by A. More on contrasting BLAS kernel implementations.
% Only the acronym is brace-protected so styles can still apply their casing.
@inproceedings{lessons,
  author    = {More, Andres},
  title     = {Lessons Learned from Contrasting {BLAS} Kernel Implementations},
  booktitle = {XVIII Congreso Argentino de Ciencias de la Computaci{\'o}n},
  year      = {2013}
}

% Book on programming the Intel Xeon Phi coprocessor.
% The edition is given as an ordinal word, as classic BibTeX styles expect.
@book{xeonphi,
  author    = {Jeffers, James and Reinders, James},
  title     = {{Intel} {Xeon} {Phi} Coprocessor High Performance Programming},
  edition   = {First},
  publisher = {Morgan Kaufmann Publishers Inc.},
  address   = {San Francisco, CA, USA},
  year      = {2013},
  isbn      = {9780124104143, 9780124104945},
}

% Journal article on latency optimization in Beowulf clusters.
% The single page/article number is written once rather than as a 3--3 range.
@article{latency,
  author    = {Garabato, Rafael and More, Andr{\'e}s and Rosales, Victor},
  title     = {Optimizing Latency in {Beowulf} Clusters},
  journal   = {CLEI Electronic Journal},
  volume    = {15},
  number    = {3},
  pages     = {3},
  year      = {2012},
  publisher = {Centro Latinoamericano de Estudios en Inform{\'a}tica}
}

% Report accompanying a matrix multiplication tool hosted on Google Code.
% The author is a person, so the name is in "Last, First" form rather than
% wrapped in double braces (which would make the whole name one surname).
% NOTE(review): technical-report entries require an institution field and none
% is recorded here -- TODO supply it or switch to the misc entry type.
@techreport{mm-tool,
  author = {More, Andres},
  title  = {A Case Study on High Performance Matrix Multiplication},
  year   = {2008},
  url    = {https://code.google.com/p/mm-matrixmultiplicationtool/}
}

% Online book by Paul E. McKenney, published via kernel.org.
@book{is-parallel-programming-hard,
  author    = {McKenney, Paul E.},
  title     = {Is Parallel Programming Hard, And, If So, What Can You Do About It?},
  publisher = {kernel.org},
  year      = {2010},
  url       = {http://kernel.org/pub/linux/kernel/people/paulmck/perfbook/perfbook.html}
}

% Conference paper introducing the Beowulf parallel workstation.
% The booktitle no longer starts with "In": bibliography styles add that word
% themselves, so keeping it would print "In In Proceedings ...".
@inproceedings{beowulf-parallel-workstation,
  author    = {Sterling, Thomas and Becker, Donald J. and Savarese, Daniel and Dorband, John E. and Ranawake, Udaya A. and Packer, Charles V.},
  title     = {{Beowulf}: A Parallel Workstation For Scientific Computation},
  booktitle = {Proceedings of the 24th International Conference on Parallel Processing},
  pages     = {11--14},
  publisher = {CRC Press},
  year      = {1995}
}

% The MPI standard document, issued as a University of Tennessee report.
% Technical reports require an author; the issuing body is given as a
% double-braced corporate author so it is treated as one indivisible name.
@techreport{mpi-standard,
  author      = {{Message Passing Interface Forum}},
  title       = {{MPI}: A {Message-Passing} Interface Standard},
  institution = {University of Tennessee},
  month       = may,
  year        = {1994}
}

% OpenMP API specification, version 3.0, by the OpenMP Architecture Review Board.
@manual{openmp-api,
  author  = {{OpenMP Architecture Review Board}},
  title   = {{OpenMP} Application Program Interface},
  edition = {3.0},
  month   = may,
  year    = {2008},
  url     = {http://www.openmp.org}
}

% Report by Fernando G. Tinetti on parallel computing over local networks.
% The personal name is in "Last, First" form and the accented letter uses a
% LaTeX escape, matching the other entries in this file.
% NOTE(review): technical-report entries require an institution field and none
% is recorded here -- TODO supply it, or retype the entry if it is a thesis.
@techreport{tinetti,
  author = {Tinetti, Fernando G.},
  title  = {C{\'o}mputo Paralelo en Redes Locales de Computadoras},
  year   = {2004}
}

% Journal article describing the gprof call-graph profiler.
% The journal name is stored in full and the tool name is brace-protected.
@article{gprof,
  author    = {Graham, Susan L. and Kessler, Peter B. and McKusick, Marshall K.},
  title     = {{gprof}: A Call Graph Execution Profiler},
  journal   = {ACM SIGPLAN Notices},
  volume    = {39},
  number    = {4},
  month     = apr,
  year      = {2004},
  pages     = {49--57},
  numpages  = {9},
  publisher = {ACM},
  address   = {New York, NY, USA},
}
   
% Hennessy and Patterson's computer architecture textbook, third edition.
% The title is stored in Title Case; the edition is an ordinal word.
@book{hennessy,
  author    = {Hennessy, John L. and Patterson, David A.},
  title     = {Computer Architecture: A Quantitative Approach},
  edition   = {Third},
  publisher = {Morgan Kaufmann Publishers Inc.},
  address   = {San Francisco, CA, USA},
  year      = {2002},
  isbn      = {1-55860-596-7},
}

% Intel's optimization reference manual (document number 248966-018).
% Typed as a manual: vendor documentation has no publisher, and a book entry
% with a number but no series triggers a style warning. The month uses the
% standard three-letter macro.
@manual{intel-optimization,
  author = {{Intel Corporation}},
  title  = {{Intel\textsuperscript{\textregistered}} 64 and {IA-32} Architectures Optimization Reference Manual},
  month  = mar,
  year   = {2009},
  note   = {Order Number 248966-018}
}

% Ulrich Drepper's paper on memory behaviour for programmers.
% NOTE(review): the institution is taken from the redhat.com host of the URL --
% confirm it matches the affiliation printed on the document.
@techreport{memory,
  author      = {Drepper, Ulrich},
  title       = {What Every Programmer Should Know About Memory},
  institution = {Red Hat, Inc.},
  month       = nov,
  year        = {2007},
  url         = {http://people.redhat.com/drepper/cpumemory.pdf}
}

% Slide deck by Brendan Gregg hosted on SlideShare.
% Typed as misc because no issuing institution exists for a technical report.
% NOTE(review): howpublished is inferred from the SlideShare URL -- confirm.
@misc{gregg,
  author       = {Gregg, Brendan},
  title        = {{Linux} Performance Analysis and Tools},
  howpublished = {Presentation slides},
  month        = feb,
  year         = {2013},
  url          = {http://es.slideshare.net/brendangregg/linux-performance-analysis-and-tools}
}

% Book on design patterns for parallel programming.
% The title is stored in Title Case, since book titles are not recased by
% standard styles.
@book{patterns,
  author    = {Mattson, Timothy and Sanders, Beverly and Massingill, Berna},
  title     = {Patterns for Parallel Programming},
  edition   = {First},
  publisher = {Addison-Wesley Professional},
  year      = {2004},
  isbn      = {0321228111},
}
  
% Chapter in an edited volume on performance analysis and grid computing.
% For this entry type, title is the chapter title and booktitle is the
% containing book; the chapter field is reserved for a chapter number.
@incollection{automatic,
  author    = {Margalef, Tom{\`a}s and Jorba, Josep and Morajko, Oleg and Morajko, Anna and Luque, Emilio},
  title     = {Different Approaches to Automatic Performance Analysis of Distributed Applications},
  booktitle = {Performance Analysis and Grid Computing},
  editor    = {Getov, Vladimir and Gerndt, Michael and Hoisie, Adolfy and Malony, Allen and Miller, Barton},
  pages     = {3--19},
  numpages  = {17},
  publisher = {Kluwer Academic Publishers},
  address   = {Norwell, MA, USA},
  year      = {2004},
  isbn      = {1-4020-7693-2},
  keywords  = {automatic performance analysis, distributed computing, dynamic tuning},
}
  
% Conference paper from SC '08 on capturing performance knowledge.
@inproceedings{capturing,
  author    = {Huck, Kevin A. and Hernandez, Oscar and Bui, Van and Chandrasekaran, Sunita and Chapman, Barbara and Malony, Allen D. and McInnes, Lois Curfman and Norris, Boyana},
  title     = {Capturing performance knowledge for automated analysis},
  booktitle = {Proceedings of the 2008 ACM/IEEE conference on Supercomputing},
  series    = {SC '08},
  location  = {Austin, Texas},
  pages     = {49:1--49:10},
  articleno = {49},
  numpages  = {10},
  publisher = {IEEE Press},
  address   = {Piscataway, NJ, USA},
  year      = {2008},
  isbn      = {978-1-4244-2835-9},
  url       = {http://dl.acm.org/citation.cfm?id=1413370.1413420},
  acmid     = {1413420},
}
  
% Journal article on automatic performance analysis of hybrid MPI/OpenMP codes.
% The issue range uses a double hyphen, the journal name is stored in full,
% and the acronyms in the title are brace-protected.
@article{hybrid,
  author     = {Wolf, Felix and Mohr, Bernd},
  title      = {Automatic Performance Analysis of Hybrid {MPI/OpenMP} Applications},
  journal    = {Journal of Systems Architecture},
  issue_date = {November 2003},
  volume     = {49},
  number     = {10--11},
  month      = nov,
  year       = {2003},
  issn       = {1383-7621},
  pages      = {421--439},
  numpages   = {19},
  url        = {http://dx.doi.org/10.1016/S1383-7621(03)00102-4},
  doi        = {10.1016/S1383-7621(03)00102-4},
  acmid      = {967618},
  publisher  = {Elsevier North-Holland, Inc.},
  address    = {New York, NY, USA},
  keywords   = {event tracing, parallel computing, performance analysis, user interface},
}
  
% Tutorial paper introducing software performance engineering (SFM'07).
% The keyword "antipatterns" had a misspelling; the title is in Title Case.
@inproceedings{intro,
  author    = {Smith, Connie U.},
  title     = {Introduction to Software Performance Engineering: Origins and Outstanding Problems},
  booktitle = {Proceedings of the 7th international conference on Formal methods for performance evaluation},
  series    = {SFM'07},
  year      = {2007},
  isbn      = {978-3-540-72482-7},
  location  = {Bertinoro, Italy},
  pages     = {395--428},
  numpages  = {34},
  url       = {http://dl.acm.org/citation.cfm?id=1768017.1768027},
  acmid     = {1768027},
  publisher = {Springer-Verlag},
  address   = {Berlin, Heidelberg},
  keywords  = {SPE, SPE process, performance antipatterns, performance models, performance patterns, performance prediction, queueing network models, software performance engineering},
}

% Conference paper from FOSE '07 on the future of software performance engineering.
@inproceedings{future,
  author    = {Woodside, Murray and Franks, Greg and Petriu, Dorina C.},
  title     = {The Future of Software Performance Engineering},
  booktitle = {2007 Future of Software Engineering},
  series    = {FOSE '07},
  pages     = {171--187},
  numpages  = {17},
  publisher = {IEEE Computer Society},
  address   = {Washington, DC, USA},
  year      = {2007},
  isbn      = {0-7695-2829-5},
  doi       = {10.1109/FOSE.2007.32},
  url       = {http://dx.doi.org/10.1109/FOSE.2007.32},
  acmid     = {1254717},
}

% Conference paper from ICSE '76 surveying computer performance evaluation.
@inproceedings{overview,
  author    = {Browne, J. C.},
  title     = {A critical overview of computer performance evaluation},
  booktitle = {Proceedings of the 2nd international conference on Software engineering},
  series    = {ICSE '76},
  location  = {San Francisco, California, USA},
  pages     = {138--145},
  numpages  = {8},
  publisher = {IEEE Computer Society Press},
  address   = {Los Alamitos, CA, USA},
  year      = {1976},
  url       = {http://dl.acm.org/citation.cfm?id=800253.807665},
  acmid     = {807665},
  keywords  = {Data base systems, Measurement, Modeling, Networks, Performance evaluation, Scheduling, Work and capacity},
}

% HPC Toolkit project page at Rice University.
% This was a raw LaTeX bibliography item, which BibTeX skips as text outside
% any entry; it is now a real entry so the key can be cited.
@misc{hpctoolkit,
  author = {{Rice University}},
  title  = {{HPC} Toolkit},
  url    = {http://hpctoolkit.org}
}
       
% PAPI project page at the University of Tennessee.
% This was a raw LaTeX bibliography item, which BibTeX skips as text outside
% any entry; it is now a real entry so the key can be cited.
@misc{papi,
  author = {{University of Tennessee}},
  title  = {Performance Application Programming Interface},
  url    = {http://icl.cs.utk.edu/papi}
}
       
% Gene Amdahl's 1967 AFIPS conference paper.
@inproceedings{amdahl,
  author    = {Amdahl, Gene M.},
  title     = {Validity of the single processor approach to achieving large scale computing capabilities},
  booktitle = {Proceedings of the April 18-20, 1967, spring joint computer conference},
  series    = {AFIPS '67 (Spring)},
  location  = {Atlantic City, New Jersey},
  pages     = {483--485},
  numpages  = {3},
  publisher = {ACM},
  address   = {New York, NY, USA},
  year      = {1967},
  url       = {http://doi.acm.org/10.1145/1465482.1465560},
  doi       = {10.1145/1465482.1465560},
  acmid     = {1465560},
}
  
% Proceedings volume edited by Mandelbrot and Passoja (MRS extended abstracts).
% The acknowledgement value was a bare macro name with no string definition
% visible in this file, which BibTeX reports as an undefined macro; it is now
% a braced literal. NOTE(review): restore the bare macro if a string
% definition for it is loaded from another database file.
@proceedings{mandel,
  editor          = {Beno{\^\i}t B. Mandelbrot and Dann E. Passoja},
  title           = {Fractal Aspects of Materials: Metal and Catalyst Surfaces, Powders and Aggregates: Extended Abstracts},
  booktitle       = {Fractal Aspects of Materials: Metal and Catalyst Surfaces, Powders and Aggregates: Extended Abstracts},
  series          = {Materials Research Society extended abstracts},
  volume          = {EA--4},
  publisher       = {Materials Research Society},
  address         = {Pittsburgh, PA, USA},
  pages           = {v + 47},
  year            = {1984},
  LCCN            = {QA447 .F72 1984},
  bibdate         = {Mon Sep 10 14:59:48 MDT 2012},
  bibsource       = {clas.caltech.edu:210/INNOPAC;
                     http://www.math.utah.edu/pub/bibnet/authors/m/mandelbrot-benoit.bib},
  acknowledgement = {ack-nhfb},
  remark          = {Proceedings of Symposium P, 1984 Fall Meeting of the
                     Materials Research Society, November 26--27, 1984,
                     Boston Marriott Hotel at Copley Place, Boston,
                     Massachusetts.},
  subject         = {Geometry; Congresses; Fractals; Surfaces
                     (Technology)},
}

% NASA Ames technical report by D. Bailey on misleading performance reporting.
% This was a raw LaTeX bibliography item, which BibTeX skips as text outside
% any entry; it is now a real entry so the key can be cited.
@techreport{twelve-ways,
  author      = {Bailey, D.},
  title       = {Twelve Ways to Fool the Masses When Giving Performance Results on Parallel Computers},
  type        = {RNR Technical Report},
  number      = {RNR-90-020},
  institution = {NASA Ames Research Center},
  year        = {1991}
}
  
% modern version of that

% Gustafson's short CACM note revisiting Amdahl's Law.
% Only the proper noun is brace-protected; the issue number and month are
% added for volume 31.
@article{gustafson,
  author  = {Gustafson, John L.},
  title   = {Reevaluating {Amdahl's} Law},
  journal = {Communications of the ACM},
  volume  = {31},
  number  = {5},
  month   = may,
  year    = {1988},
  pages   = {532--533}
}

% CACM article by Karp and Flatt on measuring parallel processor performance.
% The journal name is spelled in full so all CACM entries in this file render
% the same way.
@article{karp-flatt,
  author    = {Karp, Alan H. and Flatt, Horace P.},
  title     = {Measuring Parallel Processor Performance},
  journal   = {Communications of the ACM},
  volume    = {33},
  number    = {5},
  month     = may,
  year      = {1990},
  issn      = {0001-0782},
  pages     = {539--543},
  numpages  = {5},
  publisher = {ACM},
  address   = {New York, NY, USA},
}

% Article comparing throughput computing on CPU and GPU.
% Acronyms in the title are brace-protected against style down-casing, and
% the journal name is stored in full.
@article{gpu-myth,
  author    = {Lee, Victor W. and Kim, Changkyu and Chhugani, Jatin and Deisher, Michael and Kim, Daehyun and Nguyen, Anthony D. and Satish, Nadathur and Smelyanskiy, Mikhail and Chennupaty, Srinivas and Hammarlund, Per and Singhal, Ronak and Dubey, Pradeep},
  title     = {Debunking the {100X} {GPU} vs. {CPU} Myth: An Evaluation of Throughput Computing on {CPU} and {GPU}},
  journal   = {ACM SIGARCH Computer Architecture News},
  volume    = {38},
  number    = {3},
  month     = jun,
  year      = {2010},
  issn      = {0163-5964},
  pages     = {451--460},
  numpages  = {10},
  publisher = {ACM},
  address   = {New York, NY, USA},
}

% McCalpin's newsletter article on memory bandwidth and machine balance.
% The month uses the predefined three-letter macro instead of a literal string.
@article{stream,
  author  = {McCalpin, J. D.},
  title   = {Memory Bandwidth and Machine Balance in Current High Performance Computers},
  journal = {IEEE Technical Committee on Computer Architecture (TCCA) Newsletter},
  month   = dec,
  year    = {1995}
}

% Conference paper on analysing hardware performance counter metrics.
% The booktitle drops its leading "In" (styles add it themselves) and closes
% the parenthesis that was left open.
@inproceedings{counters,
  author    = {Ahn, Dong H. and Vetter, Jeffrey S.},
  title     = {Scalable Analysis Techniques for Microprocessor Performance Counter Metrics},
  booktitle = {Proc. of the Conference on Supercomputers (SC2002)},
  pages     = {1--16},
  publisher = {IEEE Computer Society Press},
  year      = {2002}
}

% The LINPACK guide published by SIAM.
@book{linpack,
  author    = {Dongarra, J. J. and Moler, C. B. and Bunch, J. R. and Stewart, G. W.},
  title     = {{LINPACK} User's Guide},
  publisher = {SIAM},
  year      = {1979}
}
  
% Web resource for the HPL (High-Performance Linpack) benchmark on Netlib.
% Typed as misc, since the electronic type is not defined by the standard
% styles, and the link is in the url field rather than address.
% NOTE(review): no year is recorded for this resource -- add one if known.
@misc{hpl,
  author = {Petitet, A. and Whaley, R. C. and Dongarra, Jack and Cleary, A.},
  title  = {{HPL} -- A Portable Implementation of the {High-Performance} {Linpack} Benchmark for {Distributed-Memory} Computers},
  url    = {http://www.netlib.org/benchmark/hpl/}
}

% Journal article introducing Matplotlib.
% The package name and "2D" are brace-protected so sentence-casing styles do
% not turn them into "matplotlib" or "2d".
@article{matplotlib,
  author    = {Hunter, J. D.},
  title     = {{Matplotlib}: A {2D} Graphics Environment},
  journal   = {Computing in Science \& Engineering},
  volume    = {9},
  number    = {3},
  pages     = {90--95},
  year      = {2007},
  publisher = {IEEE Computer Society},
  abstract  = {Matplotlib is a 2D graphics package used for Python
  for application development, interactive scripting, and
  publication-quality image generation across user
  interfaces and operating systems.}
}

% Report introducing the HPC Challenge benchmark suite.
% Only the benchmark name is brace-protected, and "McCalpin" is capitalised
% the same way as in the memory-bandwidth entry by the same author.
% NOTE(review): technical-report entries require an institution field and none
% is recorded here -- TODO supply it.
@techreport{hpcc,
  author = {Piotr Luszczek and Jack J. Dongarra and David Koester and Rolf Rabenseifner and Bob Lucas and Jeremy Kepner and John McCalpin and David Bailey and Daisuke Takahashi},
  title  = {Introduction to the {HPC Challenge} Benchmark Suite},
  year   = {2005}
}

% Disabled duplicate: a second definition of the key "latency" stood here.
% BibTeX reports a repeated-entry error for it and keeps only the first
% definition earlier in this file, so dropping it does not change any citation.
% Data recorded by the duplicate that may be worth merging into the active entry:
%   authors listed as: Rafael Garabato and Victor Rosales and Andres More
%   journal = CLEI Electron. J.
%   ee      = http://www.clei.cl/cleiej/paper.php?id=248
% NOTE(review): the author order above differs from the active entry -- confirm
% which order is correct.

% CACM article by Fleming and Wallace on summarising benchmark results.
% The journal name is spelled in full so all CACM entries in this file render
% the same way.
@article{how-not-to-lie,
  author    = {Fleming, Philip J. and Wallace, John J.},
  title     = {How Not to Lie with Statistics: The Correct Way to Summarize Benchmark Results},
  journal   = {Communications of the ACM},
  volume    = {29},
  number    = {3},
  month     = mar,
  year      = {1986},
  issn      = {0001-0782},
  pages     = {218--221},
  numpages  = {4},
  publisher = {ACM},
  address   = {New York, NY, USA},
}

% Atkinson's numerical analysis textbook, second edition.
% The edition is an ordinal word as classic styles expect.
% NOTE(review): the year was missing (it is required for books); 1989 is the
% publication year associated with this ISBN -- confirm against the copy used.
@book{numerical-analysis,
  author    = {Atkinson, Kendall},
  title     = {An Introduction to Numerical Analysis},
  edition   = {Second},
  publisher = {Wiley},
  year      = {1989},
  isbn      = {0471624896},
  url       = {http://www.amazon.com/exec/obidos/redirect?tag=citeulike07-20\&path=ASIN/0471624896}
}

% Martin Gardner's Scientific American column introducing Conway's game of Life.
% Only the personal name is brace-protected, and the quoted game name uses TeX
% quote marks.
@article{conway,
  author    = {Gardner, Martin},
  title     = {Mathematical Games: The Fantastic Combinations of {John Conway's} New Solitaire Game `life'},
  journal   = {Scientific American},
  month     = oct,
  year      = {1970},
  pages     = {120--123},
  posted-at = {2008-12-03 13:54:53}
}

% Book on graphical methods for data analysis.
% Authors are separated with " and "; the comma-separated list made BibTeX
% read all four people as one malformed name.
@book{kiviat,
  author    = {Chambers, J. M. and Cleveland, W. S. and Kleiner, B. and Tukey, P. A.},
  title     = {Graphical Methods for Data Analysis},
  publisher = {Wadsworth},
  address   = {Belmont, CA},
  year      = {1983}
}

% Disabled duplicate: a second definition of the key "matplotlib" stood here,
% carrying the same fields and values as the first definition earlier in this
% file. BibTeX reports a repeated-entry error for it and uses only the first
% definition, so dropping it does not change any citation.

% Journal article describing the NumPy array structure.
% The "van der" particle is part of the surname, so it sits before the comma.
% The journal name matches the Matplotlib entry for the same journal, and the
% repeated keywords are collapsed.
@article{numpy,
  author     = {van der Walt, St{\'e}fan and Colbert, S. Chris and Varoquaux, Ga{\"e}l},
  title      = {The {NumPy} Array: A Structure for Efficient Numerical Computation},
  journal    = {Computing in Science \& Engineering},
  issue_date = {March 2011},
  volume     = {13},
  number     = {2},
  month      = mar,
  year       = {2011},
  issn       = {1521-9615},
  pages      = {22--30},
  numpages   = {9},
  url        = {http://dx.doi.org/10.1109/MCSE.2011.37},
  doi        = {10.1109/MCSE.2011.37},
  acmid      = {1957466},
  publisher  = {IEEE Educational Activities Department},
  address    = {Piscataway, NJ, USA},
  keywords   = {NumPy, Python, numerical computations, programming libraries, scientific programming},
}
